Set Up

library(tidyverse)
library(here)
library(hrbrthemes)
library(janitor)
library(corrplot)

# Fix the random number generator state so that results are reproducible.
# NOTE(review): sample.kind = "Rounding" selects the sampling behaviour used
# before R 3.6.0; presumably chosen to match results from an older R -- confirm.
RNGkind(sample.kind = "Rounding")
set.seed(1)

# Use the hrbrthemes "ipsum" theme as the default for all ggplot2 figures.
theme_set(theme_ipsum())

# Read data/creditcard.csv (path resolved by here()) into a tibble.
credit <- here("data", "creditcard.csv") %>%
  read_csv() %>%
  as_tibble()

EDA

# Preview the first rows of the data.
credit %>%
  head()

Variable Summary

# Check whether any column contains a missing value.
credit %>%
  anyNA()
[1] FALSE
# Print a summary of each column in turn.
# seq_along() is used instead of 1:ncol() so the loop body is skipped, rather
# than run with indices 1 and 0, if the table ever has zero columns.
for (col_idx in seq_along(credit)) {
  # credit[, col_idx] keeps the one-column tibble, so the printed summary
  # carries the column name as its header.
  print(summary(credit[, col_idx]))
}
      Time       
 Min.   :     0  
 1st Qu.: 54202  
 Median : 84692  
 Mean   : 94814  
 3rd Qu.:139320  
 Max.   :172792  
       V1           
 Min.   :-56.40751  
 1st Qu.: -0.92037  
 Median :  0.01811  
 Mean   :  0.00000  
 3rd Qu.:  1.31564  
 Max.   :  2.45493  
       V2           
 Min.   :-72.71573  
 1st Qu.: -0.59855  
 Median :  0.06549  
 Mean   :  0.00000  
 3rd Qu.:  0.80372  
 Max.   : 22.05773  
       V3          
 Min.   :-48.3256  
 1st Qu.: -0.8904  
 Median :  0.1799  
 Mean   :  0.0000  
 3rd Qu.:  1.0272  
 Max.   :  9.3826  
       V4          
 Min.   :-5.68317  
 1st Qu.:-0.84864  
 Median :-0.01985  
 Mean   : 0.00000  
 3rd Qu.: 0.74334  
 Max.   :16.87534  
       V5            
 Min.   :-113.74331  
 1st Qu.:  -0.69160  
 Median :  -0.05434  
 Mean   :   0.00000  
 3rd Qu.:   0.61193  
 Max.   :  34.80167  
       V6          
 Min.   :-26.1605  
 1st Qu.: -0.7683  
 Median : -0.2742  
 Mean   :  0.0000  
 3rd Qu.:  0.3986  
 Max.   : 73.3016  
       V7          
 Min.   :-43.5572  
 1st Qu.: -0.5541  
 Median :  0.0401  
 Mean   :  0.0000  
 3rd Qu.:  0.5704  
 Max.   :120.5895  
       V8           
 Min.   :-73.21672  
 1st Qu.: -0.20863  
 Median :  0.02236  
 Mean   :  0.00000  
 3rd Qu.:  0.32735  
 Max.   : 20.00721  
       V9           
 Min.   :-13.43407  
 1st Qu.: -0.64310  
 Median : -0.05143  
 Mean   :  0.00000  
 3rd Qu.:  0.59714  
 Max.   : 15.59500  
      V10           
 Min.   :-24.58826  
 1st Qu.: -0.53543  
 Median : -0.09292  
 Mean   :  0.00000  
 3rd Qu.:  0.45392  
 Max.   : 23.74514  
      V11          
 Min.   :-4.79747  
 1st Qu.:-0.76249  
 Median :-0.03276  
 Mean   : 0.00000  
 3rd Qu.: 0.73959  
 Max.   :12.01891  
      V12          
 Min.   :-18.6837  
 1st Qu.: -0.4056  
 Median :  0.1400  
 Mean   :  0.0000  
 3rd Qu.:  0.6182  
 Max.   :  7.8484  
      V13          
 Min.   :-5.79188  
 1st Qu.:-0.64854  
 Median :-0.01357  
 Mean   : 0.00000  
 3rd Qu.: 0.66251  
 Max.   : 7.12688  
      V14          
 Min.   :-19.2143  
 1st Qu.: -0.4256  
 Median :  0.0506  
 Mean   :  0.0000  
 3rd Qu.:  0.4931  
 Max.   : 10.5268  
      V15          
 Min.   :-4.49894  
 1st Qu.:-0.58288  
 Median : 0.04807  
 Mean   : 0.00000  
 3rd Qu.: 0.64882  
 Max.   : 8.87774  
      V16           
 Min.   :-14.12985  
 1st Qu.: -0.46804  
 Median :  0.06641  
 Mean   :  0.00000  
 3rd Qu.:  0.52330  
 Max.   : 17.31511  
      V17           
 Min.   :-25.16280  
 1st Qu.: -0.48375  
 Median : -0.06568  
 Mean   :  0.00000  
 3rd Qu.:  0.39968  
 Max.   :  9.25353  
      V18           
 Min.   :-9.498746  
 1st Qu.:-0.498850  
 Median :-0.003636  
 Mean   : 0.000000  
 3rd Qu.: 0.500807  
 Max.   : 5.041069  
      V19           
 Min.   :-7.213527  
 1st Qu.:-0.456299  
 Median : 0.003735  
 Mean   : 0.000000  
 3rd Qu.: 0.458949  
 Max.   : 5.591971  
      V20           
 Min.   :-54.49772  
 1st Qu.: -0.21172  
 Median : -0.06248  
 Mean   :  0.00000  
 3rd Qu.:  0.13304  
 Max.   : 39.42090  
      V21           
 Min.   :-34.83038  
 1st Qu.: -0.22839  
 Median : -0.02945  
 Mean   :  0.00000  
 3rd Qu.:  0.18638  
 Max.   : 27.20284  
      V22            
 Min.   :-10.933144  
 1st Qu.: -0.542350  
 Median :  0.006782  
 Mean   :  0.000000  
 3rd Qu.:  0.528554  
 Max.   : 10.503090  
      V23           
 Min.   :-44.80774  
 1st Qu.: -0.16185  
 Median : -0.01119  
 Mean   :  0.00000  
 3rd Qu.:  0.14764  
 Max.   : 22.52841  
      V24          
 Min.   :-2.83663  
 1st Qu.:-0.35459  
 Median : 0.04098  
 Mean   : 0.00000  
 3rd Qu.: 0.43953  
 Max.   : 4.58455  
      V25           
 Min.   :-10.29540  
 1st Qu.: -0.31715  
 Median :  0.01659  
 Mean   :  0.00000  
 3rd Qu.:  0.35072  
 Max.   :  7.51959  
      V26          
 Min.   :-2.60455  
 1st Qu.:-0.32698  
 Median :-0.05214  
 Mean   : 0.00000  
 3rd Qu.: 0.24095  
 Max.   : 3.51735  
      V27            
 Min.   :-22.565679  
 1st Qu.: -0.070840  
 Median :  0.001342  
 Mean   :  0.000000  
 3rd Qu.:  0.091045  
 Max.   : 31.612198  
      V28           
 Min.   :-15.43008  
 1st Qu.: -0.05296  
 Median :  0.01124  
 Mean   :  0.00000  
 3rd Qu.:  0.07828  
 Max.   : 33.84781  
     Amount        
 Min.   :    0.00  
 1st Qu.:    5.60  
 Median :   22.00  
 Mean   :   88.35  
 3rd Qu.:   77.17  
 Max.   :25691.16  
     Class         
 Min.   :0.000000  
 1st Qu.:0.000000  
 Median :0.000000  
 Mean   :0.001728  
 3rd Qu.:0.000000  
 Max.   :1.000000  

Distributions

# Draw a density plot for every column except the response, `Class`.
# The response is excluded by name rather than by position (previously -31)
# so the loop keeps working if columns are added or reordered.
for (col_name in setdiff(names(credit), "Class")) {
  # `.data[[col_name]]` replaces the deprecated aes_string(); the x label is
  # set explicitly so it shows the column name rather than the expression.
  p <- ggplot(credit, aes(x = .data[[col_name]])) +
    geom_density(fill = "cornsilk") +
    labs(x = col_name)

  print(p)
}

Relationship with Response

# Histogram of Time, one panel per Class, each panel with its own axis scales.
# Class is stored as a number (0/1); converting it to a factor gives a
# discrete fill legend instead of a continuous colour bar.
ggplot(data = credit, aes(x = Time, fill = factor(Class))) +
  # bins = 30 is the ggplot2 default, stated explicitly.
  geom_histogram(bins = 30) +
  facet_wrap(~Class, scales = "free") +
  labs(fill = "Class")

# Histogram of log-transformed Amount, one panel per Class.
# log1p(Amount) = log(Amount + 1) is used because Amount contains zeros:
# log(0) is -Inf, and ggplot2 drops non-finite values, so those transactions
# would otherwise be missing from the plot.
# Class is converted to a factor so the fill legend is discrete.
ggplot(data = credit, aes(x = log1p(Amount), fill = factor(Class))) +
  # bins = 30 is the ggplot2 default, stated explicitly.
  geom_histogram(bins = 30) +
  facet_wrap(~Class, scales = "free") +
  labs(x = "log(Amount + 1)", fill = "Class")

# Scatter plot of log-transformed Amount against Time, one panel per Class.
# log1p(Amount) = log(Amount + 1) keeps zero-amount transactions on the plot
# (log(0) is -Inf and would be dropped).
ggplot(data = credit, aes(x = Time, y = log1p(Amount))) +
  # alpha is a fixed setting, so it belongs on the geom rather than in aes();
  # inside aes() it is treated as a data mapping and adds a legend.
  geom_point(alpha = 0.2) +
  facet_wrap(~Class, scales = "free") +
  labs(y = "log(Amount + 1)")

The Amount variable is clearly right-skewed, so it is worth applying a log transformation to it. Because some transactions have an Amount of 0, and log(0) is -Inf, we add 1 before taking the log so that every transformed value is finite.

# Add log_amount = log(Amount + 1) as a new column.
credit <- mutate(credit, log_amount = log(Amount + 1))

# Summarise the new column.
credit %>%
  pull(log_amount) %>%
  summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   1.887   3.135   3.152   4.359  10.154 
# Density plot of the log-transformed amount.
credit %>%
  ggplot(aes(x = log_amount)) +
  geom_density(fill = "cornsilk")

Predictor Correlations

# Correlation plot (upper triangle) of every column except the response.
# `Class` is dropped by name rather than by position (previously -31), so the
# selection stays correct if columns are added or reordered.
credit %>%
  select(-Class) %>%
  cor() %>%
  corrplot(method = "square", type = "upper")

Transforming Amount has helped to reduce collinearity among the predictors. The only noticeable correlations are between V3 and Time, and between V2 and log_amount. This is as expected, since PCA produces orthogonal linear combinations, so there shouldn’t be much correlation between the resulting components.

LS0tCnRpdGxlOiAiU1RBVCA1MDggRmluYWwgUHJvamVjdCBFREEiCmF1dGhvcjogIkNhbGx1bSBBcm5vbGQiCm91dHB1dDoKICAgIGh0bWxfbm90ZWJvb2s6CiAgICAgICAgY29kZV9mb2xkaW5nOiBoaWRlCiAgICAgICAgdG9jOiB5ZXMKICAgICAgICB0b2NfZmxvYXQ6IHllcwotLS0KCiMgU2V0IFVwCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0Ka25pdHI6Om9wdHNfY2h1bmskc2V0KGVjaG8gPSBUUlVFLCB3YXJuaW5nID0gRkFMU0UsIG1lc3NhZ2UgPSBGQUxTRSwgZmlnLndpZHRoID0gMTQpCmBgYAoKYGBge3J9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KGhlcmUpCmxpYnJhcnkoaHJicnRoZW1lcykKbGlicmFyeShqYW5pdG9yKQpsaWJyYXJ5KGNvcnJwbG90KQoKUk5Ha2luZChzYW1wbGUua2luZCA9ICJSb3VuZGluZyIpCnNldC5zZWVkKDEpCgp0aGVtZV9zZXQodGhlbWVfaXBzdW0oKSkKYGBgCgpgYGB7cn0KY3JlZGl0IDwtIGFzX3RpYmJsZShyZWFkX2NzdihoZXJlKCJkYXRhIiwgImNyZWRpdGNhcmQuY3N2IikpKQpgYGAKCiMgRURBCgpgYGB7cn0KaGVhZChjcmVkaXQpCmBgYAoKIyMgVmFyaWFibGUgU3VtbWFyeQoKYGBge3J9CmFueU5BKGNyZWRpdCkKYGBgCgpgYGB7cn0KZm9yIChpIGluIDE6bmNvbChjcmVkaXQpKXsKICBwcmludChzdW1tYXJ5KGNyZWRpdFssIGldKSkKfQpgYGAKCiMjIyBEaXN0cmlidXRpb25zCgpgYGB7cn0KZm9yIChpIGluIG5hbWVzKGNyZWRpdFssIC0zMV0pKSB7CiAgcCA8LSBnZ3Bsb3QoY3JlZGl0LCBhZXNfc3RyaW5nKHggPSBpKSkgKwogICAgZ2VvbV9kZW5zaXR5KGZpbGwgPSAiY29ybnNpbGsiKQoKICBwcmludChwKQp9CmBgYAoKIyMgUmVsYXRpb25zaGlwIHdpdGggUmVzcG9uc2UKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNyZWRpdCwgYWVzKHggPSBUaW1lLCBmaWxsID0gQ2xhc3MpKSArCiAgZ2VvbV9oaXN0b2dyYW0oKSArCiAgZmFjZXRfd3JhcCh+Q2xhc3MsIHNjYWxlcyA9ICJmcmVlIikKYGBgCgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBjcmVkaXQsIGFlcyh4ID0gbG9nKEFtb3VudCksIGZpbGwgPSBDbGFzcykpICsKICBnZW9tX2hpc3RvZ3JhbSgpICsKICBmYWNldF93cmFwKH5DbGFzcywgc2NhbGVzID0gImZyZWUiKQpgYGAKCmBgYHtyfQpnZ3Bsb3QoZGF0YSA9IGNyZWRpdCwgYWVzKHggPSBUaW1lLCB5ID0gbG9nKEFtb3VudCksIGFscGhhID0gMC4yKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZmFjZXRfd3JhcCh+Q2xhc3MsIHNjYWxlcyA9ICJmcmVlIikKYGBgClRoZXJlIGlzIGEgY2xlYXIgc2tldyB0byB0aGUgYEFtb3VudGAgdmFyaWFibGUsIHNvIGl0IGlzIHdvcnRoIGFwcGx5aW5nIGEgdHJhbnNmb3JtYXRpb24gdG8gdGhlIGRhdGEuIApBcyB0aGVyZSBhcmUgdmFsdWVzIG9mIGAwYCwgd2UgbmVlZCB0byBhZGQgYDFgIHRvIGVuc3VyZSB0aGF0IHdlIGRvbid0IGdldCBgSW5mYCB2YWx1ZXMgcHJvZHVjZWQgYWZ0ZXIgbG9nIHRyYW5zZm9ybWF0aW9uLgoKYGBge3J9
CmNyZWRpdCA8LSBjcmVkaXQgJT4lCiAgbXV0YXRlKGxvZ19hbW91bnQgPSBsb2coQW1vdW50ICsgMSkpCgpzdW1tYXJ5KGNyZWRpdCRsb2dfYW1vdW50KQpgYGAKCmBgYHtyfQpnZ3Bsb3QoY3JlZGl0LCBhZXMoeCA9IGxvZ19hbW91bnQpKSArCiAgICBnZW9tX2RlbnNpdHkoZmlsbCA9ICJjb3Juc2lsayIpCmBgYAoKIyMgUHJlZGljdG9yIENvcnJlbGF0aW9ucwoKYGBge3J9CmNvcnJwbG90KGNvcihjcmVkaXRbLCAtMzFdKSwgbWV0aG9kID0gInNxdWFyZSIsIHR5cGUgPSAidXBwZXIiKQpgYGAKClRyYW5zZm9ybWF0aW9uIG9mIGBBbW91bnRgIGhhcyBoZWxwZWQgdG8gcmVkdWNlIGNvbGxpbmVhcml0aWVzIG9mIHRoZSBwcmVkaWN0b3JzLgpUaGUgb25seSBjb3JyZWxhdGlvbnMgYXJlIGJldHdlZW4gYFYzYCBhbmQgYFRpbWVgLCBhbmQgYFYyYCBhbmQgYGxvZ19hbW91bnRgLgpUaGlzIGlzIGFzIGV4cGVjdGVkIGFzIFBDQSBwcm9kdWNlcyBvcnRob2dvbmFsIGxpbmVhciBjb21iaW5hdGlvbnMsCnRoZXJlZm9yZSB0aGVyZSBzaG91bGRuJ3QgYmUgbXVjaCBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZW0uCgo=